# -*- coding:utf-8 -*-

import pdfplumber
import pandas as pd

"""
Extract tables from a PDF with pdfplumber.
Reference: https://blog.csdn.net/Asher117/article/details/89203780
"""

# with pdfplumber.open("1.PDF") as pdf:
#     content = ''
#     # len(pdf.pages) is the number of pages in the PDF document
#     for i in range(len(pdf.pages)):
#         # pdf.pages[i] reads page i+1 of the PDF document
#         page = pdf.pages[i]
#         # page.extract_text() returns the page's text content; note that
#         # nothing here strips the page number at the bottom of the page
#         page_content = page.extract_text()
#         content = content + str(page_content)
#     print(content)

# Print every table found on one page of the annual report as a DataFrame.
with pdfplumber.open("000037深南电年报.pdf") as pdf:
    # pdf.pages is 0-indexed, so index 1 selects the second page of the PDF;
    # this raises IndexError if the document has fewer than two pages.
    page = pdf.pages[1]
    # Extract the page's tables once (empty settings dict, i.e. no overrides).
    # Each table is a list of rows and each row is a list of cell values.
    page_tables = page.extract_tables(table_settings={})
    for rows in page_tables:
        if not rows:
            # No header row to build columns from; skip rather than fail
            # on an empty table.
            continue
        # The first row is used as the header, the remaining rows as data.
        header, *body = rows
        df = pd.DataFrame(body, columns=header)
        print(df)